######################
#rii_build
#paul stainier
#started 2/14/2020
#create distid3 monthly nss variables
#merge with relevant weather
#####################

rm(list = ls())
library(plyr)
library(dplyr)
library(forestmangr)
library(data.table)
library(readr)
library(collapse)
library(rlang)
library(haven)

#########################
#toggle T/F for the part that 
#you want to run 
#########################
#schedule 1: 
schedule_1_tf <- F
collapse_items <- schedule_1_tf
collapse_individuals <- schedule_1_tf
merge_household_nss <- schedule_1_tf
merge_weather <- schedule_1_tf
#all schedule 1 code must happen together


#schedule 10:
schedule_10_tf <- F
collapse_activities10 <- schedule_10_tf
merge_individual10 <- schedule_10_tf
merge_household10 <- schedule_10_tf
merge_weather10 <- schedule_10_tf
#all schedule 10 code must happen together


#nfhs:
merge_weather_nfhs <- F

#icrisat:
merge_weather_icrisat <- T

#########################
#replace with own directories before running
#########################
input_directory <- 
output_directory <- 
crosswalk_directory <- 
working_crosswalk_directory <- 
sum_directory <- 




######################
#collapse_items
#merge item data with nutrient content info
#collapse it to the household level
#####################
#schedule 1
{
if(collapse_items){
    setwd(input_directory)
    nss_item_data <- read_csv("nss_5968all_item_key.csv")
    #drop 56-58
    #remove the non-food items
    nss_item_data <- nss_item_data[which(nss_item_data$item_code < 335),]
    #remove meals. these are captured in the individual data as meals from various sources 
    #they are part of the carpena adjustment factor for number of meals
    #including these would amount to double counting
    nss_item_data <- nss_item_data[which(!(nss_item_data$item_code == 303 & nss_item_data$round <= 63)),]
    nss_item_data <- nss_item_data[which(!(nss_item_data$item_code >= 302 & nss_item_data$item_code <= 303 & nss_item_data$round == 64)),]
    nss_item_data <- nss_item_data[which(!(nss_item_data$item_code == 303 & nss_item_data$round == 61)),]
    nss_item_data <- nss_item_data[which(!(nss_item_data$item_code == 302 & nss_item_data$round == 66)),]
    nss_item_data <- nss_item_data[which(!(nss_item_data$item_code >= 302 & nss_item_data$item_code <= 303 & nss_item_data$round == 66)),]
    nss_item_data <- nss_item_data[which(!(nss_item_data$item_code >= 280 & nss_item_data$item_code <= 289 & nss_item_data$round == 68)),]
    
    #make home produced quantities/values 0 instead of NA
    nss_item_data$home_prod_qty[which(is.na(nss_item_data$home_prod_qty))] <- 0
    nss_item_data$home_prod_val[which(is.na(nss_item_data$home_prod_val))] <- 0 
    
    nutrition_data <- read_csv("nutrition_info_rounditem_key_allrounds.csv")
    nutrition_data <- nutrition_data[which(nutrition_data$rural == 1),]
    nutrition_data <- dplyr::select(nutrition_data, -unit, -rural, -food_group, 
                                    -diversity_count, -heme, -nonveg, -foodgroupname,
                                    -notes, -gopalan_code)
    
    
    
    #merge the nutrition data with the item data and get a total nutrient content
    nss_item_data <- merge(nss_item_data, nutrition_data, by = c("round", "item_code"))
    #the data are recorded slightly differently in some rounds 
    #account for those changes 
    #in all rounds, if change = 1, that means its nutritional facts are recorded in rupees, 
    #so change the consumption quantity equal to the consumption value  
    #in round 66, some items are recorded in kg but their nutritional facts are in grams, so multiply by 1000
    #in round 68, items are recorded in grams but most nutrition facts are in kg, so divide by 1000
    nss_item_data <- mutate(
      nss_item_data,
      total_cons_qty = ifelse(change == 1, total_cons_val, total_cons_qty),
      total_cons_qty = ifelse(round == 66 & change == 2, total_cons_qty * 1000, total_cons_qty),
      total_cons_qty = ifelse(round == 68 & change == 0, total_cons_qty / 1000, total_cons_qty),
      
      home_prod_qty = ifelse(change == 1, home_prod_val, home_prod_qty),
      home_prod_qty = ifelse(round == 66 & change == 2, home_prod_qty * 1000, home_prod_qty),
      home_prod_qty = ifelse(round == 68 & change == 0, home_prod_qty / 1000, home_prod_qty)
    )
    
    #############
    #get the contribution from each item 
    #to each nutrient
    #############

    nss_item_data <- mutate(
      nss_item_data,
      #total quantities
      calories = total_cons_qty * calories_per_unit_kcal,
      fat = total_cons_qty * fat_per_unit_gm,
      protein = total_cons_qty * protein_per_unit_gm,
      iron = total_cons_qty * iron_per_unit_mg,
      zinc = total_cons_qty * zinc_per_unit_mg,
      thiamine = total_cons_qty * thiamine_per_unit_mg,
      riboflavin = total_cons_qty * riboflavin_per_unit_mg,
      niacin = total_cons_qty * niacin_per_unit_mg,
      ascorbic_acid = total_cons_qty * ascorbic_acid_per_unit_mg,
      
      
      # Home produced quantities
      calories_home = home_prod_qty * calories_per_unit_kcal,
      fat_home = home_prod_qty * fat_per_unit_gm,
      protein_home = home_prod_qty * protein_per_unit_gm,
      iron_home = home_prod_qty * iron_per_unit_mg,
      zinc_home = home_prod_qty * zinc_per_unit_mg,
      thiamine_home = home_prod_qty * thiamine_per_unit_mg,
      riboflavin_home = home_prod_qty * riboflavin_per_unit_mg,
      niacin_home = home_prod_qty * niacin_per_unit_mg,
      ascorbic_acid_home = home_prod_qty * ascorbic_acid_per_unit_mg,
      
      #purchase value of food
      purchase_val = total_cons_val - home_prod_val
    )
    
    ##########
    #get summary of which 
    #items contribute most 
    #to each nutrient
    #########

    nutrients_by_item <- collap(nss_item_data, 
                                calories + protein + iron  +
                                  zinc + thiamine + riboflavin + 
                                  niacin + ascorbic_acid + 
                                  calories_home + protein_home + 
                                  iron_home + zinc_home + thiamine_home + riboflavin_home + 
                                  niacin_home + ascorbic_acid_home  ~ item , FUN = fsum, na.rm = T)
    
    setwd(sum_directory)
    write_csv(nutrients_by_item, "nutrient_consumption_by_item.csv")
    
    
    ##########
    #collapse to the household level
    #########
   
    #fsum ignores NA values by default
    nss_item_data <- collap(nss_item_data, 
                              calories + protein + iron  +
                              zinc + thiamine + riboflavin + 
                              niacin + ascorbic_acid + 
                              calories_home + protein_home + 
                              iron_home + zinc_home + thiamine_home + riboflavin_home + 
                              niacin_home + ascorbic_acid_home ~ 
                              house_id + round, FUN = fsum)
    
    setnames(nss_item_data, c("calories" , "protein" , "iron"  ,
                     "zinc" , "thiamine" , "riboflavin" , 
                     "niacin" , "ascorbic_acid" , 
                     "calories_home" , "protein_home" , 
                     "iron_home",  "zinc_home" , "thiamine_home" , "riboflavin_home" , 
                     "niacin_home",  "ascorbic_acid_home"), 
             c("total_calories" , "total_protein" , "total_iron"  ,
               "total_zinc" , "total_thiamine" , "total_riboflavin" , 
               "total_niacin" , "total_ascorbic_acid" , 
               "total_calories_home" , "total_protein_home" , 
               "total_iron_home",  "total_zinc_home" , "total_thiamine_home" , "total_riboflavin_home" , 
               "total_niacin_home",  "total_ascorbic_acid_home"))
    
    
    
    nss_item_data <- mutate(
      nss_item_data,
      calories_home_share = total_calories_home / total_calories,
      protein_home_share = total_protein_home / total_protein,
      iron_home_share = total_iron_home / total_iron,
      zinc_home_share = total_zinc_home / total_zinc,
      thiamine_home_share = total_thiamine_home / total_thiamine,
      riboflavin_home_share = total_riboflavin_home / total_riboflavin,
      niacin_home_share = total_niacin_home / total_niacin,
      ascorbic_acid_home_share = total_ascorbic_acid_home / total_ascorbic_acid
    )

    
    rm(list = "nutrition_data")
  }
  
if(collapse_individuals){
    setwd(input_directory)
    #load the individual data and change the age of infants and adults to 0 and 99 (no separation)
    nss_individual_data <- read_csv("nss_5968all_individual_key.csv")
    nss_individual_data$weight <- NULL
    
    #merge with the household data to get nco
    nss_household_data <- read_csv("nss_5968all_household_key.csv")
    nss_household_data <- nss_household_data[which(nss_household_data$round > 58),]
    nss_household_data <- dplyr::select(nss_household_data, house_id , round, nco)
    nss_individual_data <- merge(nss_individual_data, nss_household_data, by = c("house_id", "round"), all.x = T)
    
    work_effort_data <- read_csv("nco_effort.csv")
    work_effort_data <- dplyr::select(work_effort_data, nco , round, effort_cat)
    nss_individual_data <- merge(nss_individual_data, work_effort_data, by = c("nco", "round"), all.x = T)
    nss_individual_data$nco <- NULL
    nss_individual_data$effort_cat[which(is.na(nss_individual_data$effort_cat))] <- 1
    nss_individual_data$effort_cat[which(nss_individual_data$age < 18)] <- 0
    
    #household composition 
    #remove anyone who is away the entire time
    nss_individual_data <- mutate(
      nss_individual_data,
      days_away = replace(days_away, is.na(days_away), 0),
      adult_man = ifelse(sex == 1 & age >= 18 & days_away < 30, 1, 0),
      adult_woman = ifelse(sex == 2 & age >= 18 & days_away < 30, 1, 0),
      teenage_boy = ifelse(sex == 1 & age >= 13 & age <= 17 & days_away < 30, 1, 0),
      teenage_girl = ifelse(sex == 2 & age >= 13 & age <= 17 & days_away < 30, 1, 0),
      child_boy = ifelse(sex == 1 & age >= 5 & age <= 12 & days_away < 30, 1, 0),
      child_girl = ifelse(sex == 2 & age >= 5 & age <= 12 & days_away < 30, 1, 0),
      infant_boy = ifelse(sex == 1 & age <= 4 & days_away < 30, 1, 0),
      infant_girl = ifelse(sex == 2 & age <= 4 & days_away < 30, 1, 0),
      
      #add age categories 
      adult = ifelse(age >= 18 & days_away < 30, 1, 0),
      adult_lt30 = ifelse(age >= 18 & age < 30 & days_away < 30, 1, 0),
      adult_30_40 = ifelse(age >= 30 & age < 40 & days_away < 30, 1, 0),
      adult_40_50 = ifelse(age >= 40 & age < 50 & days_away < 30, 1, 0),
      adult_50_60 = ifelse(age >= 50 & age < 60 & days_away < 30, 1, 0),
      adult_gte60 = ifelse(age >= 60 & days_away < 30, 1, 0),
      adult_gte50 = ifelse(age >= 50 & days_away < 30, 1, 0),
      
      #add age categories by man vs woman
      #man
      man_lt30 = ifelse(age >= 18 & age < 30 & sex == 1 & days_away < 30, 1, 0),
      man_30_40 = ifelse(age >= 30 & age < 40 & sex == 1 & days_away < 30, 1, 0),
      man_40_50 = ifelse(age >= 40 & age < 50 & sex == 1 & days_away < 30, 1, 0),
      man_50_60 = ifelse(age >= 50 & age < 60 & sex == 1 & days_away < 30, 1, 0),
      man_gte60 = ifelse(age >= 60 & sex == 1 & days_away < 30, 1, 0),
      man_gte50 = ifelse(age >= 50 & sex == 1 & days_away < 30, 1, 0),
      
      #woman
      woman_lt30 = ifelse(age >= 18 & age < 30 & sex == 2 & days_away < 30, 1, 0),
      woman_30_40 = ifelse(age >= 30 & age < 40 & sex == 2 & days_away < 30, 1, 0),
      woman_40_50 = ifelse(age >= 40 & age < 50 & sex == 2 & days_away < 30, 1, 0),
      woman_50_60 = ifelse(age >= 50 & age < 60 & sex == 2 & days_away < 30, 1, 0),
      woman_gte60 = ifelse(age >= 60 & sex == 2 & days_away < 30, 1, 0),
      woman_gte50 = ifelse(age >= 50 & sex == 2 & days_away < 30, 1, 0),
      
      #marital status 
      married_adult = ifelse(marital_stat == 2 & age >= 18 & days_away < 30, 1, 0),
      unmarried_adult = ifelse(marital_stat != 2  & age >= 18 & days_away < 30, 1, 0),
      adult = ifelse(age >= 18 & days_away < 30, 1, 0),
      married_man_lt30 = ifelse(sex == 1 & marital_stat == 2 & age >= 18 & age < 30 & days_away < 30, 1, 0),
      unmarried_man_lt30 = ifelse(sex == 1 & marital_stat != 2 & age >= 18 & age < 30 & days_away < 30, 1, 0),
      married_woman_lt30 = ifelse(sex == 2 & marital_stat == 2 & age >= 18 & age < 30 & days_away < 30, 1, 0),
      unmarried_woman_lt30 = ifelse(sex == 2 & marital_stat != 2 & age >= 18 & age < 30 & days_away < 30, 1, 0),
      
      #when summing up, this will count the number of people in the house
      num_people = ifelse(days_away < 30, 1, 0)
    )
    
    #adults of the same gender have the same listed needs regardless of the age
    nss_individual_data$age[which(nss_individual_data$age >= 18)] <- 99
    
    
    #prepare nutritional needs file
    nutritional_needs <- read_csv("nutritional_needs_agegroup_key.csv")
    nutritional_needs <- dplyr::select(nutritional_needs, -notes)
    `%notin%` <- Negate(`%in%`)
    #don't know about pregnancies - could figure out lactating if necessary
    omit_women <- c("pregnant (extra for calories, thiamine, riboflavin, niacin)", 
                    "lactating 0-6m (extra for calories, thiamine, riboflavin, niacin)", 
                    "lactating 6-12m (extra for calories, thiamine, riboflavin, niacin)")
    nutritional_needs <- nutritional_needs[which(nutritional_needs$age %notin% omit_women),]
    nutritional_needs$age <- as.numeric(as.numeric(as.character(nutritional_needs$age)))
    
    
    #merge the individual data with their needs, and multiply by 30 to create 30day needs
    nss_individual_data <- merge(nss_individual_data, nutritional_needs, by = c("sex", "age", "effort_cat"))
    nss_individual_data <- mutate(
      nss_individual_data,
      calories_need_kcal = calories_need_kcal * (30 - days_away),
      protein_need_g = protein_need_g * (30 - days_away),
      iron_need_mg = iron_need_mg * (30 - days_away),
      zinc_need_mg = zinc_need_mg * (30 - days_away),
      thiamine_need_mg = thiamine_need_mg * (30 - days_away),
      riboflavin_need_mg = riboflavin_need_mg * (30 - days_away),
      niacin_need_mg = niacin_need_mg * (30 - days_away),
      ascorbic_acid_need_mg = ascorbic_acid_need_mg * (30 - days_away)
    )
    
    #education of household head
    nss_individual_data$educ <- as.numeric(nss_individual_data$educ)
    max_head_educ <- nss_individual_data[which(nss_individual_data$relation_to_head == 1),] %>%
      group_by(round, house_id) %>%
      summarise(educ = max(educ, na.rm =  T))
    max_head_educ <- max_head_educ[is.finite(max_head_educ$educ),]
    
    setwd(crosswalk_directory)
    educ_crosswalk <- read_csv("educ_collapse_crosswalk.csv")
    max_head_educ <- merge(max_head_educ, educ_crosswalk, by = c("round", "educ"), all.x = T)
    
    #remove unnecessary columns
    nss_individual_data <- dplyr::select(nss_individual_data, -schedule, -relation_to_head, -sex, -age,
                                         -marital_stat, -educ, -person_num, -group)
    nss_individual_data <- nss_individual_data %>%
      group_by(round, house_id) %>% 
      summarise_all(sum, na.rm = T)
    
    #make outcome variables that only depend on individual data
    nss_individual_data <- mutate(nss_individual_data,
                                  #man to woman ratios
                                  adult_female_perc =  adult_woman / (adult_woman+adult_man)*100,
                                  child_female_perc = (teenage_girl + child_girl + infant_girl)/(teenage_boy + child_boy + infant_boy + teenage_girl + child_girl + infant_girl)*100,
                                  male_lt18 = teenage_boy + child_boy + infant_boy,
                                  female_lt18 = teenage_girl + child_girl + infant_girl,
                                  adult_lt30_female_perc = woman_lt30/(woman_lt30+man_lt30)*100,
                                  adult_30_40_female_perc = woman_30_40/(woman_30_40+man_30_40)*100,
                                  adult_40_50_female_perc = woman_40_50/(woman_40_50+man_40_50)*100,
                                  adult_50_60_female_perc = woman_50_60/(woman_50_60+man_50_60)*100,
                                  adult_lt60_female_perc = (woman_lt30 + woman_30_40 + woman_40_50 + woman_50_60)/(man_lt30 + man_30_40 + man_40_50 + man_50_60 + woman_lt30 + woman_30_40 + woman_40_50 + woman_50_60)*100,
                                  adult_gte60_female_perc = (woman_gte60)/(man_gte60 + woman_gte60)*100,
                                  married_adult_perc = married_adult/(married_adult + unmarried_adult)*100,
                                  adult_woman_lt60 = woman_lt30 + woman_30_40 + woman_40_50 + woman_50_60,
                                  adult_man_lt60 = man_lt30 + man_30_40 + man_40_50 + man_50_60
                                  )

    nss_individual_data <- merge(nss_individual_data, max_head_educ, by = c("round", "house_id"), all.x = T)
    nss_individual_data <- dplyr::select(nss_individual_data, -educ, -educ2_desc)
    colnames(nss_individual_data)[colnames(nss_individual_data) == 'educ2'] <- 'head_educ'
  }
  
if(merge_household_nss){
    setwd(input_directory)
    #load the household data and the get the corresponding year and month of the survey
    nss_household_data <- read_csv("nss_5968all_household_key.csv")
    nss_household_data <- nss_household_data[which(nss_household_data$sector == 1),]
    nss_household_data <- nss_household_data[!is.na(nss_household_data$surv_date),]
    nss_household_data$surv_date2 <- sprintf("%06d", nss_household_data$surv_date)
    nss_household_data <- nss_household_data[!is.na(nss_household_data$surv_date),]
    
    #merge with the individual data and the household data
    nss_household_data <- merge(nss_household_data, nss_individual_data, by = c("round", "house_id"))
    nss_household_data <- merge(nss_household_data, nss_item_data, by = c("round", "house_id"))
    
    #make the adjustment factor from carpena
    nss_household_data <- mutate(
      nss_household_data,
      meals_away = meals_empl + meals_pay + meals_school + meals_other,
      adjustment_factor = (meals_home + meals_away) / (meals_home + non_household_member_meals),
      adjustment_factor = replace(adjustment_factor, !is.finite(adjustment_factor), 1)
    )
    
    
    #adjust all the sums once and for all
    nss_household_data <- mutate(
      nss_household_data,
      total_calories = total_calories * adjustment_factor,
      total_protein = total_protein * adjustment_factor,
      total_iron = total_iron * adjustment_factor,
      total_zinc = total_zinc * adjustment_factor,
      total_thiamine = total_thiamine * adjustment_factor,
      total_riboflavin = total_riboflavin * adjustment_factor,
      total_niacin = total_niacin * adjustment_factor,
      total_ascorbic_acid = total_ascorbic_acid * adjustment_factor,
      total_calories_home = total_calories_home * adjustment_factor,
      total_protein_home = total_protein_home * adjustment_factor,
      total_iron_home = total_iron_home * adjustment_factor,
      total_zinc_home = total_zinc_home * adjustment_factor,
      total_thiamine_home = total_thiamine_home * adjustment_factor,
      total_riboflavin_home = total_riboflavin_home * adjustment_factor,
      total_niacin_home = total_niacin_home * adjustment_factor,
      total_ascorbic_acid_home = total_ascorbic_acid_home * adjustment_factor,
    )
    
    
    #calculate the nutrient ratios
    nss_household_data <- mutate(
      nss_household_data,
      calories_ratio = total_calories / calories_need_kcal,
      protein_ratio = total_protein / protein_need_g,
      iron_ratio = total_iron / iron_need_mg,
      zinc_ratio = total_zinc / zinc_need_mg,
      thiamine_ratio = total_thiamine / thiamine_need_mg,
      riboflavin_ratio = total_riboflavin / riboflavin_need_mg,
      niacin_ratio = total_niacin / niacin_need_mg,
      ascorbic_acid_ratio = total_ascorbic_acid / ascorbic_acid_need_mg
    )
    
    
    
    #calculate nutrients per capita 
    nss_household_data <- mutate(
      nss_household_data,
      calories_pc = total_calories / num_people,
      protein_pc = total_protein / num_people,
      iron_pc = total_iron / num_people,
      zinc_pc = total_zinc / num_people,
      thiamine_pc = total_thiamine / num_people,
      riboflavin_pc = total_riboflavin / num_people,
      niacin_pc = total_niacin / num_people,
      ascorbic_acid_pc = total_ascorbic_acid / num_people
    )
    
    #log calories and log iron
    #iron per calorie
    nss_household_data <- mutate(
      nss_household_data,
      log_calories_pc = log(calories_pc),
      log_iron_pc = log(iron_pc),
      log_zinc_pc = log(zinc_pc),
      log_protein_pc = log(protein_pc),
      log_thiamine_pc = log(thiamine_pc),
      log_riboflavin_pc = log(riboflavin_pc),
      log_niacin_pc = log(niacin_pc),
      log_ascorbic_acid_pc = log(ascorbic_acid_pc),
      iron_per_cal = total_iron/total_calories
    )
    
    #measure of how much of the diet "adequacy" is made up of the home vs. the purchased
    nss_household_data <- mutate(
      nss_household_data,
      total_iron_purchase = total_iron - total_iron_home,
      total_calories_purchase = total_calories - total_calories_home,
      total_zinc_purchase = total_zinc- total_zinc_home,
      total_protein_purchase = total_protein - total_protein_home,
      total_thiamine_purchase = total_thiamine - total_thiamine_home,
      total_riboflavin_purchase = total_riboflavin - total_riboflavin_home,
      total_niacin_purchase = total_niacin- total_niacin_home,
      total_ascorbic_acid_purchase = total_ascorbic_acid - total_ascorbic_acid_home,
    )

    #iron and calories from home per cap vs. calories purchased per cap
    nss_household_data <- mutate(
      nss_household_data,
      iron_home_pc = total_iron_home / num_people,
      iron_purchase_pc = total_iron_purchase / num_people,
      calories_home_pc = total_calories_home / num_people,
      calories_purchase_pc = total_calories_purchase / num_people,
      zinc_home_pc = total_zinc_home / num_people,
      zinc_purchase_pc = total_zinc_purchase / num_people,
      protein_home_pc = total_protein_home / num_people,
      protein_purchase_pc = total_protein_purchase / num_people,
      thiamine_home_pc = total_thiamine_home / num_people,
      thiamine_purchase_pc = total_thiamine_purchase / num_people,
      riboflavin_home_pc = total_riboflavin_home / num_people,
      riboflavin_purchase_pc = total_riboflavin_purchase / num_people,
      niacin_home_pc = total_niacin_home / num_people,
      niacin_purchase_pc = total_niacin_purchase / num_people,
      ascorbic_acid_home_pc = total_ascorbic_acid_home / num_people,
      ascorbic_acid_purchase_pc = total_ascorbic_acid_purchase / num_people
    )
    
    
    
    #religion dummies
    nss_household_data <- mutate(
      nss_household_data,
      hindu = ifelse(religion == 1, 1, 0),
      muslim = ifelse(religion == 2, 1, 0),
      christian = ifelse(religion == 3, 1, 0),
      sikh = ifelse(religion == 4, 1, 0),
      jain = ifelse(religion == 5, 1, 0),
      buddhist = ifelse(religion == 6, 1, 0),
      zoro = ifelse(religion == 7, 1, 0)
    )
    
    #social group dummies
    nss_household_data <- mutate(
      nss_household_data,
      s_tribe = ifelse(social_group == 1, 1, 0),
      s_caste = ifelse(social_group == 2, 1, 0),
      obc = ifelse(social_group == 3, 1, 0)
    )
    
    #agriculture yes or no
    nss_household_data <- mutate(
      nss_household_data,
      nic_cat = floor(nic / 1000),
      ag = ifelse(nic_cat == 1, 1, 0),
      manuf = ifelse(round < 68 & nic_cat >= 15 & nic_cat <= 37, 1, 0),
      const = ifelse(round < 68 & nic_cat == 45, 1, 0),
      manuf = ifelse(round == 68 & nic_cat >= 10 & nic_cat <= 33, 1, manuf),
      const = ifelse(round == 68 & nic_cat >= 41 & nic_cat <= 43, 1, const)
    )

 
    nss_household_data <- unique(nss_household_data)
    
  }
  
if(merge_weather){
    setwd(input_directory)
    prec_data <- read_csv("era5_prec_variables_2003_2012_distid3_year_key_all.csv")
    tmax_data <- read_csv("era5_tmax_variables_2003_2012_distid3_year_key_all.csv")
    nss_household_data <- merge(nss_household_data, tmax_data, by = c("distid3", "year"))
    nss_household_data <- merge(nss_household_data, prec_data, by = c("distid3", "year"))
    
    for(year_name in c("last_y", "this_y")){
      for(grow_name in c("grow", "nogrow", "all")){
        name_gt100 <- paste("tmax_gt100", ".", year_name, grow_name, sep= "")
        name_gt110 <- paste("tmax_gt110", ".", year_name, grow_name, sep= "")
        name_100_110 <- paste("tmax_100_110", ".", year_name, grow_name, sep= "")
        nss_household_data[,name_gt100] <- as.data.frame(nss_household_data[,name_100_110] + nss_household_data[,name_gt110])
      }
    }
    setwd(output_directory)
    write_csv(nss_household_data, "nss_nutrition_era5_weather_2003_2012_all_analysis.csv")
  }
}


#schedule10
{
  if(collapse_activities10){
    setwd(input_directory)
    nss_activity_data <- read_csv("nss10_6068_activity_key.csv")
    nss_activity_data <- nss_activity_data %>% group_by(person_id2, round) %>% mutate(activity_id = row_number())
    nss_activity_data$wage_cash[is.na(nss_activity_data$wage_cash)] <- 0 
    nss_activity_data <- mutate(nss_activity_data, 
                                activity_nic_cat = floor(week_nic /1000),
                                ag_wage = ifelse(activity_nic_cat == 1, wage_cash, 0),
                                nonag_wage = wage_cash - ag_wage)
    
    nss_activity_person_data <- collap(nss_activity_data, 
                                       ag_wage + nonag_wage ~ house_id + person_id2 + round, 
                                       FUN = fsum)
    rm(list = "nss_activity_data")
    nss_activity_person_data$ag_wage[which(is.na(nss_activity_person_data$ag_wage))] <- 0
    nss_activity_person_data$nonag_wage[which(is.na(nss_activity_person_data$nonag_wage))] <- 0
  }
  
  if(merge_individual10){
    setwd(input_directory)
    nss_individual_data <- read_csv("nss10_6068_individual_key.csv")
    nss_individual_data <- select(nss_individual_data, 
                                  -person_id)
    
    #create different principal activity categories
    nss_individual_data <- mutate(nss_individual_data, 
                                  unempl_principal = ifelse(principal_status == 81, 1, 0),
                                  in_school_principal = ifelse(principal_status == 91, 1, 0),
                                  could_work_principal = ifelse(principal_status < 83, 1, 0),
                                  working_principal = ifelse(principal_status < 52, 1, 0),
                                  working_at_home_principal = ifelse(principal_status < 22, 1, 0),
                                  working_away_principal = ifelse(principal_status >= 31 & principal_status <= 51, 1, 0),
                                  domestic_duties_principal = ifelse(principal_status == 92 | principal_status == 93, 1, 0),
                                  principal_nic_cat = floor(principal_nic/1000),
                                  ag_principal = ifelse(principal_nic_cat == 1, 1, 0),
                                  ag_at_home_principal = ifelse(ag_principal == 1 & working_at_home_principal == 1, 1, 0),
                                  nonag_at_home_principal = ifelse(ag_principal == 0 & working_at_home_principal == 1, 1, 0),
                                  ag_away_principal = ifelse(ag_principal == 1 & working_away_principal == 1, 1, 0),
                                  nonag_away_principal = ifelse(ag_principal == 0 & working_away_principal == 1, 1, 0),
                                  manuf_principal = ifelse((round < 68 & principal_nic_cat >= 15 & principal_nic_cat <= 37) | 
                                                        (round == 68 & principal_nic_cat >= 10 & principal_nic_cat <= 33), 1, 0),
                                  const_principal = ifelse((round < 68 & principal_nic_cat == 45) | 
                                                        (round == 68 & principal_nic_cat >= 41 & principal_nic_cat <= 43), 1, 0))
    
    nss_individual_data$ag_principal[which(is.na(nss_individual_data$ag_principal))] <- 0
    nss_individual_data$manuf_principal[which(is.na(nss_individual_data$manuf_principal))] <- 0
    nss_individual_data$const_principal[which(is.na(nss_individual_data$const_principal))] <- 0
    
    nss_individual_data <- mutate(nss_individual_data,
                                  nonag_occ_principal = working_principal - ag_principal)
    
    nss_individual_data <- merge(nss_individual_data, nss_activity_person_data, by =c("person_id2", "house_id", "round"), all.x = T)
    rm(list = "nss_activity_person_data")
                          
    #education information at individual level
    setwd(crosswalk_directory)
    educ_crosswalk <- read_csv("educ_collapse_crosswalk.csv")
    colnames(educ_crosswalk)[colnames(educ_crosswalk) == 'educ'] <- 'educ_gen'
    nss_individual_data$educ_gen <- as.numeric(nss_individual_data$educ_gen)
    nss_individual_data <- merge(nss_individual_data, educ_crosswalk, by = c("round", "educ_gen"), all.x = T)
    
    #remove education column and replace with educ2 
    nss_individual_data <- select(nss_individual_data, -educ_gen, -educ2_desc)
    colnames(nss_individual_data)[colnames(nss_individual_data) == 'educ2'] <- 'educ_gen_cat'
    
  } 
  if(merge_household10){
    setwd(input_directory)
    nss_household_data <- read_csv("nss10_6068_household_key.csv")
    nss_household_data <- select(nss_household_data, 
                                 -schedule, -nic, -nco, -house_type,
                                  -statename, -districtname)
    
    nss_individual_data <- merge(nss_individual_data, nss_household_data, by = c("round", "house_id"))
    nss_individual_data <- nss_individual_data[which(nss_individual_data$sector == 1),]
    rm(list = "nss_household_data")
  }
  if(merge_weather10){
    setwd(input_directory)
    tmax_data <- read_csv("era5_tmax_variables_2003_2012_distid3_year_key_all.csv")
    tmax_data <- tmax_data %>%
      select(-matches("neither|two_ya|three_ya"))
    
    for(year_name in c("last_y", "this_y")){
      for(grow_name in c("grow", "nogrow")){
        name_gt100 <- paste("tmax_gt100", ".", year_name, grow_name, sep= "")
        name_gt110 <- paste("tmax_gt110", ".", year_name, grow_name, sep= "")
        name_100_110 <- paste("tmax_100_110", ".", year_name, grow_name, sep= "")
        tmax_data[,name_gt100] <- as.data.frame(tmax_data[,name_100_110] + tmax_data[,name_gt110])
        
      }
    }
    
    nss_individual_data <- merge(nss_individual_data, tmax_data, by = c("distid3", "year"))
    rm(list = "tmax_data")
    prec_data <- read_csv("era5_prec_variables_2003_2012_distid3_year_key_all.csv")
    prec_data <- prec_data %>%
      select(-matches("neither|two_ya|three_ya"))
    nss_individual_data <- merge(nss_individual_data, prec_data, by = c("distid3", "year"))
    rm(list = "prec_data")
    
    
    
    setwd(input_directory)
    cpial <- read_csv("cpial_2003_2012.csv")
    nss_individual_data <- merge(nss_individual_data, cpial, by = c("year", "month"), all.x = T)
    nss_individual_data <- mutate(nss_individual_data, 
                                  ag_wage68 = ag_wage * cpi_al_2012dec,
                                  nonag_wage68 = nonag_wage * cpi_al_2012dec,
                                  school_age = ifelse(age >= 5 & age <= 16, 1, 0),
                                  work_age = ifelse(age >= 14 & age <= 65, 1, 0))
    
    setwd(output_directory)
    write_csv(nss_individual_data, "nss10_employment_era5_weather_2004_2012_analysis.csv")
  }
}

#nfhs
if(merge_weather_nfhs){
  ####################
  #all age mortality
  ####################
  #load the data
  setwd(input_directory)
  nfhs_mortality_data <- read_dta("nfhs_death_rate_rural_district_n4_year_2012_2020.dta")
  tmax_data <- read_csv("era5_tmax_variables_2009_2020_district_n4_year_key_all.csv")
  prec_data <- read_csv("era5_prec_variables_2009_2020_district_n4_year_key_all.csv")
  
  #merge the data
  nfhs_mortality_data <- left_join(nfhs_mortality_data, tmax_data, by = c("district_n4", "year"))
  nfhs_mortality_data <- left_join(nfhs_mortality_data, prec_data, by = c("district_n4", "year"))
  
  #remove far off islands 
  nfhs_mortality_data <- nfhs_mortality_data[which(nfhs_mortality_data$statename4 != "Andaman and Nicobar Islands" & nfhs_mortality_data$statename4 != "Lakshadweep" ),]
  
  #save
  setwd(output_directory)
  write_csv(nfhs_mortality_data, "nfhs_mortality_era5_weather_2012_2020_analysis.csv")
  
  ####################
  #mortality by age
  ####################
  #load the data
  setwd(input_directory)
  nfhs_mortality_data <- read_dta("nfhs_death_rate_by_age_rural_district_n4_year_2012_2020.dta")
  tmax_data <- read_csv("era5_tmax_variables_2009_2020_district_n4_year_key_all.csv")
  prec_data <- read_csv("era5_prec_variables_2009_2020_district_n4_year_key_all.csv")
  
  #merge the data
  nfhs_mortality_data <- left_join(nfhs_mortality_data, tmax_data, by = c("district_n4", "year"))
  nfhs_mortality_data <- left_join(nfhs_mortality_data, prec_data, by = c("district_n4", "year"))
  
  #remove far off islands 
  nfhs_mortality_data <- nfhs_mortality_data[which(nfhs_mortality_data$statename4 != "Andaman and Nicobar Islands" & nfhs_mortality_data$statename4 != "Lakshadweep" ),]
  
  #save
  setwd(output_directory)
  write_csv(nfhs_mortality_data, "nfhs_mortality_by_age_era5_weather_2012_2020_analysis.csv")
  
  
  ####################
  #infant mortality, last year
  ####################
  #load the data
  setwd(input_directory)
  nfhs_infant_mortality_data <- read_dta("nfhs_infant_death_rate_rural_district_n4_year_2009_2018.dta")
  tmax_data <- read_csv("era5_tmax_variables_2009_2020_district_n4_year_key_all.csv")
  prec_data <- read_csv("era5_prec_variables_2009_2020_district_n4_year_key_all.csv")
  
  #rename birth_year to year for the merge 
  setnames(nfhs_infant_mortality_data, c("birth_year"), 
           c("year"))
  
  #merge the data
  nfhs_infant_mortality_data <- left_join(nfhs_infant_mortality_data, tmax_data, by = c("district_n4", "year"))
  nfhs_infant_mortality_data <- left_join(nfhs_infant_mortality_data, prec_data, by = c("district_n4", "year"))
  
  #remove far off islands 
  nfhs_infant_mortality_data <- nfhs_infant_mortality_data[which(nfhs_infant_mortality_data$statename4 != "Andaman and Nicobar Islands" & nfhs_infant_mortality_data$statename4 != "Lakshadweep" ),]
  
  #save
  setwd(output_directory)
  write_csv(nfhs_infant_mortality_data, "nfhs_infant_mortality_era5_weather_2009_2018_analysis.csv")
  
  ####################
  #infant mortality, in utero
  ####################
  #load the data
  setwd(input_directory)
  nfhs_infant_mortality_data <- read_dta("nfhs_infant_death_individual_rural_district_n4_year_2009_2018.dta")
  tmax_data <- read_csv("era5_tmax_2009_2020_inutero_district_n4_yearmonth_key.csv")
  prec_data <- read_csv("era5_prec_2009_2020_inutero_district_n4_yearmonth_key.csv")
  
  #rename birth_year to year for the merge 
  setnames(nfhs_infant_mortality_data, c("birth_year", "birth_month"), 
           c("year", "month"))
  
  #merge the data
  nfhs_infant_mortality_data <- left_join(nfhs_infant_mortality_data, tmax_data, by = c("district_n4", "year", "month"))
  nfhs_infant_mortality_data <- left_join(nfhs_infant_mortality_data, prec_data, by = c("district_n4", "year", "month"))
  
  #remove far off islands 
  nfhs_infant_mortality_data <- nfhs_infant_mortality_data[which(nfhs_infant_mortality_data$statename4 != "Andaman and Nicobar Islands" & nfhs_infant_mortality_data$statename4 != "Lakshadweep" ),]
  
  #rename year back to birth_year 
  setnames(nfhs_infant_mortality_data, c("year", "month"),
           c("birth_year", "birth_month"))
  
  
  #save
  setwd(output_directory)
  write_csv(nfhs_infant_mortality_data, "nfhs_infant_mortality_inutero_era5_weather_2009_2018_analysis.csv")
}



#icrisat
if(merge_weather_icrisat){
  #all crops
  setwd(input_directory)
  icrisat_data <- read_csv("icrisat_districtyear_build.csv")
  prec_data <- read_csv("era5_prec_variables_1980_2012_icrisat_district_year_key_all.csv")
  temp_data <- read_csv("era5_tmax_variables_1980_2012_icrisat_district_year_key_all.csv")
  icrisat_data <- merge(icrisat_data, prec_data, by = c("icrisat_district", "year"))
  icrisat_data <- merge(icrisat_data, temp_data, by = c("icrisat_district", "year"))
  setwd(output_directory)
  write_csv(icrisat_data, "icrisat_yield_era5_weather_1981_2011_analysis.csv")
}



